Read in tree

tree <- read.tree('~/serverUA/sander/carrot_isolates/06_phylogenetic_tree/raxml_out/RAxML_bestTree.carrotisolates')

# Root tree
out_ex_tips_names <- c("GCA-001437015", "GCA-001438885")

out_ex_tips <- which(tree$tip.label %in% out_ex_tips_names)
out_mrca <- findMRCA(tree, tip = out_ex_tips)
tree <- reroot(tree, node.number = out_mrca, position = 0.5)

p <- ggtree(tree) +  geom_tree()
p

# Get the label order
d <- fortify(tree)
dd <- subset(d, isTip)
order <- dd$label[order(dd$y,decreasing=T)] %>%
  str_replace("GCA-", "GCA_")

General hits

ggplot(dbcan, aes(x = evalue, y = coverage)) +
  geom_point() +
  scale_x_log10() +
  geom_vline(xintercept = 1e-15,
             yintercept = 0.35)

Filter

Apply dbCAN cutoff

dbcan_filtered <- dbcan %>%
  filter(evalue < 1e-15,
         coverage > 0.35) %>%
  mutate(Assembly = as_factor(Assembly),
         Assembly = fct_relevel(Assembly, order))

How many per genome

dbcan_filtered %>%
  add_count(Assembly) %>%
  select(Assembly, speciesClade, n) %>%
  distinct() %>%
  mutate(speciesClade = if_else(is.na(speciesClade), "Z other", speciesClade)) %>%
  ggplot(aes(x = Assembly, y = n, fill = speciesClade)) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Paired")

GHs only

dbcan_filtered %>%
  filter(str_detect(queryName, "GH")) %>%
  add_count(Assembly) %>%
  select(Assembly, speciesClade, n) %>%
  distinct() %>%
  mutate(speciesClade = if_else(is.na(speciesClade), "Z other", speciesClade)) %>%
  ggplot(aes(x = Assembly, y = n, fill = speciesClade)) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Paired")

Heatmap all

GH_heatmap <- dbcan_filtered %>%
  filter(str_detect(queryName, "GH")) %>%
  select(Assembly, queryName) %>%
  add_count(Assembly, queryName) %>%
  distinct() %>%
  spread(key = queryName, value = n) %>%
  replace(is.na(.), 0) %>%
  as.data.frame()

rownames(GH_heatmap) <- GH_heatmap$Assembly %>%
  str_replace_all("GCA_","GCA-")

GH_heatmap <- GH_heatmap %>% select(-Assembly)

gheatmap(p, GH_heatmap, width=15, colnames = T, color=NA,offset=0.1) + 
  ggtitle("GH") + 
  scale_fill_gradient(high="#132B43",low="#56B1F7") 
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.

Only isolates

dbcan_filtered %>%
  filter(!is.na(speciesClade)) %>%
  filter(str_detect(queryName, "GH")) %>%
  add_count(Assembly) %>%
  select(Assembly, speciesClade, n) %>%
  distinct() %>%
  ggplot(aes(x = Assembly, y = n, fill = speciesClade)) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Paired")

Isolates only heatmap. This code is extremely ugly. Needs cleaning up.

plotdf <- dbcan_filtered %>%
  filter(str_detect(queryName, "GH"),
         !is.na(speciesClade)) %>%
  select(Assembly, queryName, speciesClade) %>%
  add_count(Assembly, queryName) %>%
  distinct() %>%
  spread(queryName, n) %>%
  replace(is.na(.), 0) %>%
  gather(key = "queryName", value = 'n', -Assembly, -speciesClade) %>%
  mutate(value = cut(n,
                     breaks = c(-Inf,0, 2, 4, 6, 8, 10, Inf),
                     labels = c("0","1 - 2","2 - 4", "4 - 6","6 - 8", "8 - 10", ">10"))) %>% # Convert to discrete
  mutate(value = factor(as.character(value), levels=rev(levels(value)))) 

# Label formatting of y-axis
myLabels <- plotdf %>%
  left_join(annotation) %>%
  mutate(Name = paste0('italic(', genus ,')~italic(', species, ')~', strain)) %>%
  pull(Name) %>%
  unique 
## Joining, by = c("Assembly", "speciesClade")
# Map labels to new variable
plotdf$Assembly_renamed <- mapvalues(plotdf$Assembly, unique(plotdf$Assembly),
                                  parse(text = myLabels))

# Reorder the speciesClades according to phylogeny
speciesCladeOrder <- tibble(Assembly = order) %>%
  left_join(annotation %>% select(Assembly, speciesClade)) %>%
  filter(!is.na(speciesClade)) %>%
  pull(speciesClade) %>%
  unique() 
## Joining, by = "Assembly"
plotdf <- plotdf %>%
  mutate(speciesClade = factor(speciesClade, levels = speciesCladeOrder))

# Reorder queryName
queryNameorder <- plotdf %>%
  mutate(Number = str_replace_all(queryName, "GH", "")) %>%
  separate(Number, into= c("Number", "Number2")) %>%
  mutate(Number = as.integer(Number)) %>%
  arrange(Number, Number2) %>%
  pull(queryName) %>%
  unique()

plotdf <- plotdf %>%
  mutate(queryName = factor(queryName, levels = queryNameorder))

# Finally plot
heatmap <- plotdf %>%
  
  ggplot(aes(x  = queryName, y = Assembly_renamed, fill = value)) +
  geom_tile(colour = "white", size = 0.25) +
  labs(x = "", y = "") +
  facet_grid(speciesClade~., scales = "free_y", space = "free_y") +
  scale_y_discrete(expand=c(0,0), labels = function(x) parse(text = x)) +
  #scale_fill_manual(values=c("#d53e4f","#f46d43","#fdae61", "#fee08b","#e6f598","#abdda4","#ddf1da")) +
  scale_fill_manual(values=rev(brewer.pal(7,"YlGnBu"))) +
  theme_grey(base_size=8) +
  #ggtitle("Glycosyl hydrolases") +
  guides(fill = guide_legend(nrow = 1, label.position = "bottom", label.vjust = -0.2, reverse = T)) +
  theme(
    #remove legend title
    legend.title=element_blank(),
    #remove legend margin
    legend.margin = grid::unit(0,"cm"),
    #change legend text properties
    legend.text=element_text(size=7,face="bold"),
    #change legend key height
    legend.key.height=grid::unit(0.2,"cm"),
    #set a slim legend
    legend.key.width=grid::unit(0.8,"cm"),
    legend.position = "bottom",
    #set x axis text size and colour
    axis.text.x=element_text(size=6, angle = 90, hjust = 1, vjust = 0.5),
    #set y axis text colour and adjust vertical justification
    axis.text.y=element_text(size = 8, vjust = 0.2, margin = margin(r = 5)),
    #change axis ticks thickness
    axis.ticks=element_line(size=0.4),
    #change title font, size, colour and justification
    plot.title=element_text(hjust=0,size=14,face="bold"),
    #remove plot background
    plot.background=element_blank(),
    #remove plot border
    panel.border=element_blank(),
    #remove facet_grid boxes
    strip.background = element_blank(),
    #remove facet_grid text
    strip.text.y = element_blank(),
    plot.margin = unit(c(0,0,0,0), "cm"))

heatmap

Bar grahps

barplotY <- plotdf %>%
  group_by(Assembly_renamed, speciesClade) %>%
  summarize(sum = sum(n)) %>%
  ggplot(aes(x = Assembly_renamed, y = sum)) +
  geom_col(fill = "#d95f02", alpha = 0.8) +
  facet_grid(speciesClade~., scales = "free_y", space = 'free_y') + 
  coord_flip() +
  xlab("") +
  ylab("# of GH") +
  theme_minimal() + 
  scale_y_continuous(expand = c(0,0)) +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.x = element_text(size = 6, angle = 90, hjust = 1, vjust = 0.5),
        strip.text = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.x = element_blank())

barplotY

barplotX <- plotdf %>%
  group_by(queryName) %>%
  summarize(sumX = sum(n)) %>%
  ggplot(aes(x = queryName, y = sumX)) +
  geom_col(fill = "#d95f02", alpha = 0.8) +
  xlab("") +
  ylab("# of GH") +
  ggtitle("Glycosyl hydrolases") +
  theme_minimal() + 
  theme(axis.text.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 6),
        strip.text = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        plot.margin = unit(c(0,0,0,0), "cm"),
        plot.title=element_text(hjust=0,size=14,face="bold"))

barplotX

Add bargraphs.

ggempty <- plotdf %>%
  ggplot(aes(x = queryName, y = n)) +
  geom_blank() +
  labs(x="", y="") +
  theme(panel.background = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank())

GHS <- ggarrange(barplotX,
          ggempty,
          heatmap,
          barplotY,
          ncol = 2,
          nrow = 2,
          widths = c(6,1),
          heights = c(1,8),
          padding = unit(0, "cm"))

GHS

ggsave("GHs.pdf",GHS, units = "cm", width = 21, height = 29.7)

Number of different GHs

dbcan_filtered %>%
  filter(str_detect(queryName, "GH"),
                  !is.na(speciesClade)) %>%
  separate(queryName, into= c("queryName", "Number2")) %>%
  pull(queryName) %>%
  unique() %>%
  length
## [1] 41

Sum ranges

plotdf %>%
  group_by(Assembly_renamed, speciesClade) %>%
  summarize(sum = sum(n)) %>%
  group_by(speciesClade) %>%
  summarize(minSum = min(sum), maxSum = max(sum)) %>%
  arrange(-maxSum)
## # A tibble: 8 x 3
##   speciesClade   minSum maxSum
##   <fct>           <dbl>  <dbl>
## 1 Unclassified 2     59     61
## 2 plantarum          37     54
## 3 Unclassified 1     50     50
## 4 paraplantarum      36     43
## 5 paracasei          20     39
## 6 mesenteroides      30     35
## 7 brevis             22     30
## 8 citreum            27     29

Sum cols

plotdf %>%
  group_by(queryName) %>%
  summarize(sum = sum(n)) %>%
  arrange(-sum)
## # A tibble: 50 x 2
##    queryName   sum
##    <fct>     <dbl>
##  1 GH1         398
##  2 GH25        333
##  3 GH13_31     253
##  4 GH65        205
##  5 GH73        141
##  6 GH31         99
##  7 GH36         85
##  8 GH13_20      79
##  9 GH2          75
## 10 GH32         64
## # ... with 40 more rows

GTs only

dbcan_filtered %>%
  filter(str_detect(queryName, "GT")) %>%
  add_count(Assembly) %>%
  select(Assembly, speciesClade, n) %>%
  distinct() %>%
  mutate(speciesClade = if_else(is.na(speciesClade), "Z other", speciesClade)) %>%
  ggplot(aes(x = Assembly, y = n, fill = speciesClade)) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Paired")

Only isolates

dbcan_filtered %>%
  filter(!is.na(speciesClade)) %>%
  filter(str_detect(queryName, "GT")) %>%
  add_count(Assembly) %>%
  select(Assembly, speciesClade, n) %>%
  distinct() %>%
  ggplot(aes(x = Assembly, y = n, fill = speciesClade)) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Paired")

Isolates only heatmap. This code is extremely ugly. Needs cleaning up.

plotdf <- dbcan_filtered %>%
  filter(str_detect(queryName, "GT"),
         !is.na(speciesClade)) %>%
  select(Assembly, queryName, speciesClade) %>%
  add_count(Assembly, queryName) %>%
  distinct() %>%
  spread(queryName, n) %>%
  replace(is.na(.), 0) %>%
  gather(key = "queryName", value = 'n', -Assembly, -speciesClade) %>%
  mutate(value = cut(n,
                     breaks = c(-Inf,0, 2, 4, 6, 8, 10, Inf),
                     labels = c("0","1 - 2","2 - 4", "4 - 6","6 - 8", "8 - 10", ">10"))) %>% # Convert to discrete
  mutate(value = factor(as.character(value), levels=rev(levels(value)))) 

# Label formatting of y-axis
myLabels <- plotdf %>%
  left_join(annotation) %>%
  mutate(Name = paste0('italic(', genus ,')~italic(', species, ')~', strain)) %>%
  pull(Name) %>%
  unique 
## Joining, by = c("Assembly", "speciesClade")
# Map labels to new variable
plotdf$Assembly_renamed <- mapvalues(plotdf$Assembly, unique(plotdf$Assembly),
                                  parse(text = myLabels))

# Reorder the speciesClades according to phylogeny
speciesCladeOrder <- tibble(Assembly = order) %>%
  left_join(annotation %>% select(Assembly, speciesClade)) %>%
  filter(!is.na(speciesClade)) %>%
  pull(speciesClade) %>%
  unique() 
## Joining, by = "Assembly"
plotdf <- plotdf %>%
  mutate(speciesClade = factor(speciesClade, levels = speciesCladeOrder))

# Reorder queryName
queryNameorder <- plotdf %>%
  mutate(Number = str_replace_all(queryName, "GT", "")) %>%
  separate(Number, into= c("Number", "Number2")) %>%
  mutate(Number = as.integer(Number)) %>%
  arrange(Number, Number2) %>%
  pull(queryName) %>%
  unique()

plotdf <- plotdf %>%
  mutate(queryName = factor(queryName, levels = queryNameorder))

# Finally plot
heatmap <- plotdf %>%
  
  ggplot(aes(x  = queryName, y = Assembly_renamed, fill = value)) +
  geom_tile(colour = "white", size = 0.25) +
  labs(x = "", y = "") +
  facet_grid(speciesClade~., scales = "free_y", space = "free_y") +
  scale_y_discrete(expand=c(0,0), labels = function(x) parse(text = x)) +
  #scale_fill_manual(values=c("#d53e4f","#f46d43","#fdae61", "#fee08b","#e6f598","#abdda4","#ddf1da")) +
  scale_fill_manual(values=rev(brewer.pal(7,"YlGnBu"))) +
  theme_grey(base_size=8) +
  #ggtitle("Glycosyl hydrolases") +
  guides(fill = guide_legend(nrow = 1, label.position = "bottom", label.vjust = -0.2, reverse = T)) +
  theme(
    #remove legend title
    legend.title=element_blank(),
    #remove legend margin
    legend.margin = grid::unit(0,"cm"),
    #change legend text properties
    legend.text=element_text(size=7,face="bold"),
    #change legend key height
    legend.key.height=grid::unit(0.2,"cm"),
    #set a slim legend
    legend.key.width=grid::unit(0.8,"cm"),
    legend.position = "bottom",
    #set x axis text size and colour
    axis.text.x=element_text(size=8, angle = 90, hjust = 1, vjust = 0.5),
    #set y axis text colour and adjust vertical justification
    axis.text.y=element_text(size = 8, vjust = 0.2, margin = margin(r = 5)),
    #change axis ticks thickness
    axis.ticks=element_line(size=0.4),
    #change title font, size, colour and justification
    plot.title=element_text(hjust=0,size=14,face="bold"),
    #remove plot background
    plot.background=element_blank(),
    #remove plot border
    panel.border=element_blank(),
    #remove facet_grid boxes
    strip.background = element_blank(),
    #remove facet_grid text
    strip.text.y = element_blank(),
    plot.margin = unit(c(0,0,0,0), "cm"))

heatmap

Bar grahps

barplotY <- plotdf %>%
  group_by(Assembly_renamed, speciesClade) %>%
  summarize(sum = sum(n)) %>%
  ggplot(aes(x = Assembly_renamed, y = sum)) +
  geom_col(fill = "#d95f02", alpha = 0.8) +
  facet_grid(speciesClade~., scales = "free_y", space = 'free_y') + 
  coord_flip() +
  xlab("") +
  ylab("# of GT") +
  theme_minimal() + 
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.x = element_text(size = 6, angle = 90, hjust = 1, vjust = 0.5),
        strip.text = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.margin = unit(c(0,0,0,0), "cm"))

barplotY

barplotX <- plotdf %>%
  group_by(queryName) %>%
  summarize(sumX = sum(n)) %>%
  ggplot(aes(x = queryName, y = sumX)) +
  geom_col(fill = "#d95f02", alpha = 0.8) +
  xlab("") +
  ylab("# of GT") +
  ggtitle("Glycosyltransferases") +
  theme_minimal() + 
  theme(axis.text.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 6),
        strip.text = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        plot.margin = unit(c(0,0,0,0), "cm"),
        plot.title=element_text(hjust=0,size=14,face="bold"))

barplotX

Add bargraphs.

ggempty <- plotdf %>%
  ggplot(aes(x = queryName, y = n)) +
  geom_blank() +
  labs(x="", y="") +
  theme(panel.background = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank())

ggarrange(barplotX,
          ggempty,
          heatmap,
          barplotY,
          ncol = 2,
          nrow = 2,
          widths = c(6,1),
          heights = c(1,8)) +
  theme(plot.margin = unit(c(0,0,0,0), "cm"))

## NULL

Number of different GTs

dbcan_filtered %>%
  filter(str_detect(queryName, "GT"),
                  !is.na(speciesClade)) %>%
  separate(queryName, into= c("queryName", "Number2")) %>%
  pull(queryName) %>%
  unique() %>%
  length
## [1] 13

Sum ranges

plotdf %>%
  group_by(Assembly_renamed, speciesClade) %>%
  summarize(sum = sum(n)) %>%
  group_by(speciesClade) %>%
  summarize(minSum = min(sum), maxSum = max(sum)) %>%
  arrange(-maxSum)
## # A tibble: 8 x 3
##   speciesClade   minSum maxSum
##   <fct>           <dbl>  <dbl>
## 1 plantarum          26     37
## 2 paraplantarum      29     31
## 3 Unclassified 1     29     29
## 4 paracasei          21     27
## 5 Unclassified 2     21     25
## 6 citreum            17     21
## 7 brevis             11     20
## 8 mesenteroides      17     20

Sum cols

plotdf %>%
  group_by(queryName) %>%
  summarize(sum = sum(n)) %>%
  arrange(-sum)
## # A tibble: 13 x 2
##    queryName   sum
##    <fct>     <dbl>
##  1 GT2         596
##  2 GT4         580
##  3 GT51        134
##  4 GT28         66
##  5 GT26         52
##  6 GT5          39
##  7 GT35         38
##  8 GT8          21
##  9 GT32         18
## 10 GT83         10
## 11 GT14          6
## 12 GT81          1
## 13 GT101         1

Cluster analysis GTs

gene_coordinates <- read.table("out_geneTables/geneTable_allGenomes.tsv", sep = ' ', col.names = c("contig", "genome", "start", "end", "strand", "gene")) %>%
  mutate(genome = str_replace(genome, "AMB-F", "AMBF")) %>%
  filter(genome %in% (dbcan_filtered %>% filter(!is.na(speciesClade)) %>% pull(Assembly))) %>%
  mutate(gene = str_sub(gene, 4, -1)) 
GTs <- dbcan_filtered %>%
  filter(str_detect(queryName, "GT"),
         !is.na(speciesClade)) %>%
  rename(gene = HMM) %>%
  left_join(gene_coordinates) %>%
  mutate(geneNumber = gene) %>%
  separate(geneNumber, into = c("X", "geneNumber")) %>%
  select(-X)
## Joining, by = "gene"
ggplot(GTs, aes(x = geneNumber, y = genome)) +
  geom_point() +
  facet_grid(speciesClade~., scales = "free_y", space = "free_y") 

We will first look at the dispersion of GTs. How far are they away from each othe?

GTs %>%
  group_by(contig) %>%
  mutate(diffToPrev = as.numeric(geneNumber) - as.numeric(lag(geneNumber)),
         difftoNext = as.numeric(lead(geneNumber)) - as.numeric(geneNumber)) %>%
  filter(diffToPrev < 50,
         difftoNext < 50) %>%
  gather(key = "key", value = "difference", starts_with("diffto")) %>%
  ggplot(aes(x = difference, fill = key)) +
  geom_density() +
  facet_wrap(~key, nrow = 2, scales = "free")

The majority of them show a difference of 1, meaning they have a GT close by. In addition, 10 might be a good cut off to use here. So here we will cluster all GTs together if they are at maximum 10 genes apart.

# Set cutoff
cutoff <- 10

# Initiate table
clustersdf <- tibble(contig = character(), genes = character())

for (Contig in as.character(unique(GTs$contig))) {
  geneNumbers <- GTs %>%
    filter(contig == Contig) %>%
    pull(geneNumber) %>%
    as.numeric()
  
  clusters <- list(c(geneNumbers[1]))
  
  if (length(geneNumbers) > 1) {
    for (i in 2:length(geneNumbers)) {
      el <- geneNumbers[i]
      
      d <- el - geneNumbers[i - 1]
      
      if (d < cutoff) {
        clusters[[length(clusters)]] <- c(clusters[[length(clusters)]], el)
      } else {
        clusters[[length(clusters) + 1]] <- c(el)
      }
    }
  }
  clustersdf <- clustersdf %>%
    add_row(contig = Contig, genes = clusters)
}
clustersdf <- clustersdf %>%
  mutate(genomeName = contig) %>%
  separate(genomeName, into = c("genomeName", "X")) %>%
  select(-X) %>%
  group_by(genomeName) %>%
  mutate(clustername = str_c(str_sub(contig, 1, 8), "_cluster_", 1:length(contig))) %>%
  unnest(genes) %>%
  mutate(gene = if_else(
    genes < 10,
    str_c(genomeName, "_", "0000", as.character(genes)),
      if_else(
        genes > 9 &
          genes < 100,
        str_c(genomeName, "_", "000", as.character(genes)),
        if_else(
          genes > 99 &
            genes < 1000,
          str_c(genomeName, "_", "00", as.character(genes)),
          str_c(genomeName, "_", "0", as.character(genes))
        )
      )
    )
  ) %>%
  #select(-genes) %>%
  left_join(dbcan_filtered %>% rename(gene = HMM))
## Joining, by = "gene"
GTcount <- clustersdf %>%
  group_by(clustername) %>%
  summarise(count = n()) %>%
  left_join(clustersdf %>% select(clustername, Assembly, speciesClade)) %>%
  distinct()
## Adding missing grouping variables: `genomeName`
## Joining, by = "clustername"
GTcount %>%
  ggplot(aes(x = clustername, y = count)) +
  geom_col() +
  facet_wrap(speciesClade~Assembly, scales = "free_x")

How are they distributed?

GTcount %>%
  group_by(count) %>%
  summarise(countcount = n()) %>%
  mutate(percentage = countcount/sum(countcount))
## # A tibble: 6 x 3
##   count countcount percentage
##   <int>      <int>      <dbl>
## 1     1        836    0.724  
## 2     2        247    0.214  
## 3     3         61    0.0529 
## 4     4          4    0.00347
## 5     5          3    0.00260
## 6     6          3    0.00260

Most of the GTs are found solo, while 247 were detected with two. Only 6 showed a GT count > 4. Same but per species

GTcount %>%
  group_by(speciesClade, count) %>%
  summarise(countcount = n()) %>%
  mutate(percentage = countcount/sum(countcount))
## # A tibble: 28 x 4
## # Groups:   speciesClade [8]
##    speciesClade  count countcount percentage
##    <chr>         <int>      <int>      <dbl>
##  1 brevis            1        201    0.827  
##  2 brevis            2         41    0.169  
##  3 brevis            3          1    0.00412
##  4 citreum           1         18    0.692  
##  5 citreum           2          7    0.269  
##  6 citreum           6          1    0.0385 
##  7 mesenteroides     1         37    0.597  
##  8 mesenteroides     2         20    0.323  
##  9 mesenteroides     3          4    0.0645 
## 10 mesenteroides     5          1    0.0161 
## # ... with 18 more rows

Min and max per species?

GTcount %>%
  group_by(speciesClade) %>%
  summarise(min = min(count), max = max(count))
## # A tibble: 8 x 3
##   speciesClade     min   max
##   <chr>          <dbl> <dbl>
## 1 brevis             1     3
## 2 citreum            1     6
## 3 mesenteroides      1     5
## 4 paracasei          1     3
## 5 paraplantarum      1     3
## 6 plantarum          1     6
## 7 Unclassified 1     1     2
## 8 Unclassified 2     1     6
clustersdf %>%
  filter(clustername %in% (GTcount %>% filter(count > 1) %>% pull(clustername))) %>%
  ggplot(aes(x = clustername)) +
  geom_bar(aes(fill = queryName)) +
  facet_wrap(speciesClade~Assembly, scales = "free_x") +
  scale_fill_brewer(palette = "Paired")

Other visualisation

integer_breaks <- function(n = 2, ...) {
  breaker <- pretty_breaks(n, ...)
  function(x) {
     breaks <- breaker(x)
     breaks[breaks == floor(breaks)]
  }
}

myLabels <- GTcount %>%
  left_join(annotation) %>%
  mutate(Name = paste0('italic(', genus ,')~italic(', species, ')~', strain)) %>%
  pull(Name) %>%
  unique 
## Joining, by = c("Assembly", "speciesClade")
GTcount %>%
  mutate(Assembly_renamed = mapvalues(GTcount$Assembly, unique(GTcount$Assembly),
                                  parse(text = myLabels))) %>%
  mutate(speciesClade = factor(speciesClade, levels = speciesCladeOrder)) %>%
  mutate(count = if_else(count == 1, "1 GT", str_c(count, " GTs in cluster"))) %>%
  ggplot(aes(x = Assembly_renamed)) +
  geom_bar(fill = "#d95f02", col = "#d95f02") +
  coord_flip() +
  facet_grid(speciesClade~count, scales = "free", space = "free") +
  scale_x_discrete(expand=c(0,0), labels = function(x) parse(text = x)) +
  scale_y_continuous(breaks = integer_breaks()) +
  xlab("") +
  ylab("Number of clusters") +
  ggtitle("GT cluster analysis") +
  theme(strip.text.y = element_blank(),
        strip.text.x = element_text(angle = 90, face = "bold", size = 10, hjust = 0),
        strip.background = element_blank())

ggsave( "GT_cluster.svg", last_plot(), units = "cm", width = 21, height = 29.7)